knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidygraph)
library(patchwork)
source(here::here("R", "utils.R"))
source(here::here("R", "differential_network_utils.R"))
input_path <- "/gstore/project/lineage/luli/for_timothy"
cluster_paths <-
dir(input_path, full.names = TRUE) |>
stringr::str_subset(pattern = "weights_")
tf_names_path <- file.path(input_path, "tf_peak_connections.rds")
tg_names_path <- file.path(input_path, "peak_gene_connections.rds")
# tf names
tf_names <-
tf_names_path |>
read_rds() |>
rownames()
# re names
re_names <-
tf_names_path |>
read_rds() |>
colnames()
tg_names <-
tg_names_path |>
read_rds() |>
colnames()
special_tfs <- c("GATA6", "FOXA1", "NKX2-1")
In this notebook, we _____.
Explicitly, the goals of these analyses are to _____.
# loads in u1 and v1
load(cluster_paths[[1]])
print(paste0("Dimensions for u1: [", dim(u1)[[1]], " x ", dim(u1)[[2]], "]"))
## [1] "Dimensions for u1: [1212 x 10758]"
print(paste0("Dimensions for v1: [", dim(v1)[[1]], " x ", dim(v1)[[2]], "]"))
## [1] "Dimensions for v1: [10758 x 2162]"
# loads in u2 and v2
load(cluster_paths[[2]])
print(paste0("Dimensions for u2: [", dim(u2)[[1]], " x ", dim(u2)[[2]], "]"))
## [1] "Dimensions for u2: [1212 x 10758]"
print(paste0("Dimensions for v2: [", dim(v2)[[1]], " x ", dim(v2)[[2]], "]"))
## [1] "Dimensions for v2: [10758 x 2162]"
# load in u3 and v3
load(cluster_paths[[3]])
print(paste0("Dimensions for u3: [", dim(u3)[[1]], " x ", dim(u3)[[2]], "]"))
## [1] "Dimensions for u3: [1212 x 10758]"
print(paste0("Dimensions for v3: [", dim(v3)[[1]], " x ", dim(v3)[[2]], "]"))
## [1] "Dimensions for v3: [10758 x 2162]"
# load in u4 and v4
load(cluster_paths[[4]])
print(paste0("Dimensions for u4: [", dim(u4)[[1]], " x ", dim(u4)[[2]], "]"))
## [1] "Dimensions for u4: [1212 x 10758]"
print(paste0("Dimensions for v4: [", dim(v4)[[1]], " x ", dim(v4)[[2]], "]"))
## [1] "Dimensions for v4: [10758 x 2162]"
# load in u5 and v5
load(cluster_paths[[5]])
print(paste0("Dimensions for u5: [", dim(u5)[[1]], " x ", dim(u5)[[2]], "]"))
## [1] "Dimensions for u5: [1212 x 10758]"
print(paste0("Dimensions for v5: [", dim(v5)[[1]], " x ", dim(v5)[[2]], "]"))
## [1] "Dimensions for v5: [10758 x 2162]"
# load in u6 and v6
load(cluster_paths[[6]])
print(paste0("Dimensions for u6: [", dim(u6)[[1]], " x ", dim(u6)[[2]], "]"))
## [1] "Dimensions for u6: [1212 x 10758]"
print(paste0("Dimensions for v6: [", dim(v6)[[1]], " x ", dim(v6)[[2]], "]"))
## [1] "Dimensions for v6: [10758 x 2162]"
Put all u’s and v’s into a single data structure that is easy to manipulate
clusters <-
tibble::tibble(
cluster = paste0("cluster_", 1:6),
reprogrammed_tf =
c(
"GATA6", "unknown", "NKX2-1",
"FOXA1", "unreprogrammed", "unreprogrammed"
),
u = list(u1, u2, u3, u4, u5, u6),
v = list(v1, v2, v3, v4, v5, v6)
) |>
# convert adjacent matrices into edge lists
mutate(
u_edge_list =
map(.x = u, .f = clean_u, tf_names = tf_names, re_names = re_names),
v_edge_list =
map(.x = v, .f = clean_v, re_names = re_names, tg_names = tg_names)
)
# TF -> RE edge list of the 5th cluster
u_edge_list_5 <-
clusters |>
filter(cluster == "cluster_5") |>
pull(u_edge_list) |>
pluck(1)
# RE -> TG edge list of the 5th cluster
v_edge_list_5 <-
clusters |>
filter(cluster == "cluster_5") |>
pull(v_edge_list) |>
pluck(1)
# difference graphs with cluster 5 as the baseline
cluster_5_diff_graphs <-
map2(
.x = clusters$u_edge_list,
.y = clusters$v_edge_list,
.f = ~
build_difference_graph(
u_1 = u_edge_list_5,
u_2 = .x,
v_1 = v_edge_list_5,
v_2 = .y
)
)
# TF -> RE edge list of the 6th cluster
u_edge_list_6 <-
clusters |>
filter(cluster == "cluster_6") |>
pull(u_edge_list) |>
pluck(1)
# RE -> TG edge list of the 6th cluster
v_edge_list_6 <-
clusters |>
filter(cluster == "cluster_6") |>
pull(v_edge_list) |>
pluck(1)
# difference graphs with cluster 6 as the baseline
cluster_6_diff_graphs <-
map2(
.x = clusters$u_edge_list,
.y = clusters$v_edge_list,
.f = ~
build_difference_graph(
u_1 = u_edge_list_6,
u_2 = .x,
v_1 = v_edge_list_6,
v_2 = .y
)
)
# individual graphs for each cluster
individual_graphs <-
map2(
.x = clusters$u_edge_list,
.y = clusters$v_edge_list,
.f = ~ build_graph(u = .x, v = .y)
)
# add difference graphs and individual graphs for each cluster to the clusters tibble
clusters <-
clusters |>
mutate(
individual_graphs = individual_graphs,
cluster_5_diff_graphs = cluster_5_diff_graphs,
cluster_6_diff_graphs = cluster_6_diff_graphs
)
#centralities for each TF in the edge-subtracted graphs
diff_centralities_5 <-
cluster_5_diff_graphs |>
map(.f = calculate_tf_centralities)
diff_centralities_6 <-
cluster_6_diff_graphs |>
map(.f = calculate_tf_centralities)
# centralities in each of the cluster-specific graphs
centralities <-
individual_graphs |>
map(.f = calculate_tf_centralities)
weighted_centralities <-
individual_graphs |>
map(.f = calculate_tf_centralities, weights = abs(weight))
# add centralities
clusters$centralities_5 <- diff_centralities_5
clusters$centralities_6 <- diff_centralities_6
clusters$centralities <- centralities
clusters$weighted_centralities <- weighted_centralities
# combine all degree information for all TFs across different graphs
# placeholder for difference graphs between clusters 5/6 and themselves
empty_centrality_tibble <-
diff_centralities_5[[6]] |>
mutate(degree = 0)
centrality_tibble <-
# combine edge-subtracted centralities foe each cluster
# compared to clusters 5 and 6
tibble::tibble(
cluster = paste0("cluster_", 1:length(diff_centralities_5)),
diff_centralities_5 = diff_centralities_5,
diff_centralities_6 = diff_centralities_6
) |>
mutate(
diff_centralities_5 =
if_else(
map_lgl(.x = diff_centralities_5, .f = ~ nrow(.x) == 0),
list(empty_centrality_tibble),
diff_centralities_5
) |>
map(.f = ~transmute(.x, node_name, degree_5 = degree)),
diff_centralities_6 =
if_else(
map_lgl(.x = diff_centralities_6, .f = ~ nrow(.x) == 0),
list(empty_centrality_tibble),
diff_centralities_6
) |>
map(.f = ~transmute(.x, node_name, degree_6 = degree))
) |>
# add the centralities of each cluster in their base network
transmute(
cluster,
centralities =
map2(
.x = diff_centralities_5,
.y = diff_centralities_6,
.f = ~ left_join(.x, .y, by = "node_name")
) |>
map2(.y = centralities, .f = ~ left_join(.x, .y, by = "node_name"))
)
# print
centrality_tibble |>
unnest()
## # A tibble: 7,272 × 6
## cluster node_name degree_5 degree_6 node_type degree
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 cluster_1 RAC3 6426 6710 tf 4821
## 2 cluster_1 POLR2A 5988 6243 tf 4552
## 3 cluster_1 MIER1 5944 5994 tf 4409
## 4 cluster_1 ESR1 5893 6183 tf 4474
## 5 cluster_1 MAX 5804 5994 tf 4412
## 6 cluster_1 RAD51 5695 5919 tf 4508
## 7 cluster_1 SOX6 5687 6023 tf 4283
## 8 cluster_1 NFATC3 5602 5794 tf 4239
## 9 cluster_1 RAD21 5563 5743 tf 4233
## 10 cluster_1 CTCF 5505 5687 tf 4264
## # … with 7,262 more rows
If we look at GATA6’s differential activity score betweeen clusters 1 (reprogrammed with GATA6) and 5/6 (non-reprogrammed), we can see that GATA6’s differential activity score is relatively low compared to the full distribution.
# difference between graph 5 and graph 1
gata6_degree <-
diff_centralities_5[[1]] |>
filter(node_name == "GATA6") |>
pull(degree)
diff_centralities_5[[1]] |>
mutate(
node_name = fct_reorder(node_name, degree),
is_gata6 = if_else(node_name == "GATA6", "GATA6", "Not GATA6")
) |>
ggplot(aes(x = degree)) +
geom_histogram(binwidth = 100) +
geom_vline(xintercept = gata6_degree) +
theme_bw() +
labs(
subtitle = "Cluster 1 (GATA-6) vs. Cluster 5 (unreprogrammed)",
x = "Degree in the edge-subtracted network",
y = "Number of TFs",
caption = "Black vertical line indicates the location of GATA-6"
)
gata6_degree <-
diff_centralities_6[[1]] |>
filter(node_name == "GATA6") |>
pull(degree)
diff_centralities_6[[1]] |>
mutate(
node_name = fct_reorder(node_name, degree),
is_gata6 = if_else(node_name == "GATA6", "GATA6", "Not GATA6")
) |>
ggplot(aes(x = degree)) +
geom_histogram(binwidth = 100) +
geom_vline(xintercept = gata6_degree) +
theme_bw() +
labs(
subtitle = "Cluster 1 (GATA-6) vs. Cluster 6 (unreprogrammed)",
x = "Degree in the edge-subtracted network",
y = "Number of TFs",
caption = "Black vertical line indicates the location of GATA-6"
)
Does this mean that GATA6 activity is not very different between clusters 1 and 5/6? Maybe, but one thing we might notice from the histograms above is the wide range of degrees in the edge-subtracted network among TFs. So we might be worried about two issues:
We explore these two issues in the sections below.
To look into the first possibility, we can plot a scatterplot of a TF’s average degree across both clusters and its degree in the edge-subtracted network.
centralities_1 <-
centralities[[1]] |>
rename(degree_1 = degree)
centralities_5 <- centralities[[5]] |>
rename(degree_5 = degree)
centralities_6 <-
centralities[[6]] |>
rename(degree_6 = degree)
centralities_combined <-
centralities_1 |>
left_join(centralities_5, by = c("node_name", "node_type")) |>
left_join(centralities_6, by = c("node_name", "node_type")) |>
mutate(degree_mean = (degree_1 + degree_5 + degree_6) / 3)
plot_tibble <-
centralities_combined |>
left_join(
diff_centralities_5[[1]],
by = c("node_name", "node_type")
)
correlation <-
cor(x = plot_tibble$degree, y = plot_tibble$degree_mean) |>
round(4)
plot_tibble |>
ggplot(aes(x = degree_mean, y = degree)) +
geom_point() +
theme_bw() +
labs(
y = "Degree in edge-subtracted network",
x = "Average degree in the original networks",
caption = paste0("Correlation: ", correlation)
)
From this plot, we can see that there is a huge correlation between a TF’s degree in the edge-subtracted network and its degree in the original networks (averaged). So, we can consider normalizing the score of how differential each TF is by some metric of its degree in both networks being compared (i.e. the average of the two networks).
We can also plot what the differential score result would be using the “centrality subtraction” method (as opposed to the “edge subtraction” method above), and we can see that a version of the same problem is present (though to a lesser extent).
# brief investigation into the centrality subtraction case (as opposed to edge
# subtraction)
plot_tibble <-
make_predictions(
centralities_1 = rename(centralities_5, degree = degree_5),
centralities_2 = rename(centralities_1, degree = degree_1)
) |>
left_join(centralities_combined, by = c("node_name"))
correlation <-
cor(x = plot_tibble$degree_mean, y = plot_tibble$degree_diff) |>
round(4)
plot_tibble |>
ggplot(aes(x = degree_mean, y = degree_diff)) +
geom_point() +
theme_bw() +
labs(caption = paste0("Correlation: ", correlation))
So, we must find some way to normalize the differential “score” for each TF based on its degree in the baseline network(s).
# test with clusters 1 and 5 again
# find mean centralities for each TF in both clusters 1 and 5
baseline_centralities_15 <-
centrality_tibble |>
unnest(cols = centralities) |>
filter(cluster %in% paste0("cluster_", c(1, 5))) |>
group_by(node_name) |>
summarize(mean_degree = mean(degree)) |>
arrange(-mean_degree) |>
ungroup()
baseline_centralities_15
## # A tibble: 1,212 × 2
## node_name mean_degree
## <chr> <dbl>
## 1 RAC3 4803
## 2 POLR2A 4479
## 3 ESR1 4428.
## 4 MIER1 4400.
## 5 MAX 4346.
## 6 RAD51 4319
## 7 SOX6 4238
## 8 NFATC3 4163
## 9 RAD21 4154.
## 10 CTCF 4144
## # … with 1,202 more rows
# normalize the edge-subtracted centrality for cluster 1 and 5 by the mean values
# TO DO: Make sure that this matches with new normalize_predictions function
normalized_predictions_15_old <-
centrality_tibble |>
unnest(cols = centralities) |>
filter(cluster == "cluster_1") |>
left_join(baseline_centralities_15, by = "node_name") |>
transmute(
node_name,
degree,
degree_5,
mean_degree,
normalized_degree = degree_5 / mean_degree
) |>
arrange(-degree)
normalized_centralities_15 <-
clusters |>
filter(cluster == "cluster_1") |>
pull(centralities_5) |>
pluck(1) |>
normalize_predictions(
cluster_1_centralities = clusters$centralities[[1]],
cluster_2_centralities = clusters$centralities[[5]]
) |>
arrange(-normalized_degree) |>
rename(degree = normalized_degree)
normalized_centralities_15 |>
mutate(rank = rank(-degree)) |>
filter(node_name == "GATA6")
## # A tibble: 1 × 3
## node_name degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 1.34 188
# plot correlation
plot_tibble <-
normalized_centralities_15 |>
left_join(centralities_combined, by = "node_name")
correlation <-
cor(x = plot_tibble$degree, y = plot_tibble$degree_mean) |>
round(4)
plot_tibble |>
ggplot(aes(x = degree, y = degree_mean)) +
geom_point(alpha = 0.25) +
theme_bw() +
labs(
x = "Normalized degree in edge-subtracted network",
y = "Average degree in clusters 1 and 5",
caption = paste0("Correlation: ", correlation)
)
normalized_centralities_16 <-
clusters |>
filter(cluster == "cluster_1") |>
pull(centralities_6) |>
pluck(1) |>
normalize_predictions(
cluster_1_centralities = clusters$centralities[[1]],
cluster_2_centralities = clusters$centralities[[5]]
) |>
arrange(-normalized_degree) |>
rename(degree = normalized_degree)
normalized_centralities_15 |>
mutate(rank = rank(-degree)) |>
filter(node_name == "GATA6")
## # A tibble: 1 × 3
## node_name degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 1.34 188
# plot correlation
plot_tibble <-
normalized_centralities_16 |>
left_join(centralities_combined, by = "node_name")
correlation <-
cor(x = plot_tibble$degree, y = plot_tibble$degree_mean) |>
round(4)
plot_tibble |>
ggplot(aes(x = degree, y = degree_mean)) +
geom_point(alpha = 0.25) +
theme_bw() +
labs(
x = "Normalized degree in edge-subtracted network",
y = "Average degree in clusters 1 and 6",
caption = paste0("Correlation: ", correlation)
)
# find the deciles of the weights in the
# difference graph between cluster 1 and cluster 5
quantiles_51 <-
clusters$cluster_5_diff_graphs[[1]] |>
activate("edges") |>
as_tibble() |>
pull(weight) |>
abs() |>
quantile(probs = seq(0, 1, 0.1)) |>
enframe() |>
rename(quantile = name)
print(quantiles_51)
## # A tibble: 11 × 2
## quantile value
## <chr> <dbl>
## 1 0% 1.96e-13
## 2 10% 6.43e- 7
## 3 20% 1.02e- 4
## 4 30% 5.19e- 4
## 5 40% 1.41e- 3
## 6 50% 3.12e- 3
## 7 60% 6.35e- 3
## 8 70% 1.29e- 2
## 9 80% 2.80e- 2
## 10 90% 7.72e- 2
## 11 100% 1.28e+ 2
# plot the distribution of weights
clusters$cluster_5_diff_graphs[[1]] |>
activate("edges") |>
as_tibble() |>
arrange(abs(weight)) |>
mutate(
weight_sign = sign(weight),
log_weight = log10(abs(weight)),
#log_weight = weight_sign * log_weight
) |>
ggplot(aes(x = log_weight)) +
geom_histogram(bins = 35) +
geom_vline(xintercept = log10(1e-6), linetype = "dashed") +
theme_bw() +
labs(
subtitle = "Cluster 5 vs. Cluster 1",
x = "abs(weight); log10 scale",
caption = "Dashed line indicates Luli's threshold"
)
# find the deciles of the weights in the
# difference graph between cluster 5 and cluster 6
quantiles_56 <-
cluster_5_diff_graphs[[6]] |>
activate("edges") |>
as_tibble() |>
pull(weight) |>
abs() |>
quantile(probs = seq(0, 1, 0.1)) |>
enframe() |>
rename(quantile = name)
print(quantiles_56)
## # A tibble: 11 × 2
## quantile value
## <chr> <dbl>
## 1 0% 6.18e-14
## 2 10% 4.71e- 7
## 3 20% 7.00e- 5
## 4 30% 3.84e- 4
## 5 40% 1.11e- 3
## 6 50% 2.55e- 3
## 7 60% 5.35e- 3
## 8 70% 1.11e- 2
## 9 80% 2.47e- 2
## 10 90% 7.13e- 2
## 11 100% 1.28e+ 2
# plot weight distributions
cluster_5_diff_graphs[[6]] |>
activate("edges") |>
as_tibble() |>
arrange(-weight) |>
mutate(
weight_sign = sign(weight),
log_weight = log10(abs(weight)),
) |>
ggplot(aes(x = log_weight)) +
geom_histogram(bins = 35) +
geom_vline(xintercept = -6, linetype = "dashed") +
theme_bw() +
labs(
subtitle = "Cluster 5 vs. Cluster 6", x = "abs(weight); log10 scale",
caption = "Vertical line indicates Luli's threshold"
)
One way to address this issue would be to threshold.
So, where do we draw the threshold? Maybe the median? Another quantile?
unfiltered_graph_51 <-
clusters$cluster_5_diff_graphs[[1]]
filtered_graph_51 <-
clusters$cluster_5_diff_graphs[[1]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_51
## # A tbl_graph: 2749 nodes and 1718614 edges
## #
## # A directed acyclic multigraph with 82 components
## #
## # Node Data: 2,749 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg SPSB1_gene
## 6 tg NPPB_gene
## # … with 2,743 more rows
## #
## # Edge Data: 1,718,614 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00895 0 0.00895
## 2 1 2 0.00794 0.0238 0.0159
## 3 1 2 0.00717 -0.00000352 -0.00717
## # … with 1,718,611 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_51_filtered <-
filtered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_51_unfiltered <-
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_51_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 0.892 25
## 2 NKX2-1 0.826 1059
## 3 FOXA1 0.784 1194
special_tfs_51_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 NKX2-1 1.35 53
## 2 GATA6 1.34 188
## 3 FOXA1 1.31 1037
# plot histogram of all TFs
## filtered
filtered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_51_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 1 (GATA-6) vs Cluster 5",
color = NULL
)
## unfiltered
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_51_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 1 (GATA-6) vs Cluster 5",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 5); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 5); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
This strategy seems to introduce another issue, which is that TFs with a really small number of edges in the filtered graph can have unstable normalized degree scores. Note that several of the “most differential” TFs above have very low degrees in the original networks:
clusters$individual_graphs[[1]] |>
calculate_tf_centralities() |>
arrange(degree)
## # A tibble: 1,212 × 3
## node_name node_type degree
## <chr> <chr> <dbl>
## 1 BANF1 tf 1
## 2 PSIP1 tf 2
## 3 BATF3 tf 8
## 4 ATM tf 15
## 5 ZNF544 tf 18
## 6 H2AFX tf 22
## 7 ZNF354C tf 25
## 8 ZNF555 tf 25
## 9 ZNF670 tf 29
## 10 ZNF214 tf 31
## # … with 1,202 more rows
One way to deal with this is to only consider TFs with more than a certain number of edges (in the code below, above the 1st percentile of all TFs in the network):
filtered_graph_51 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 5); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
unfiltered_graph_61 <-
clusters$cluster_6_diff_graphs[[1]]
filtered_graph_61 <-
clusters$cluster_6_diff_graphs[[1]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_61
## # A tbl_graph: 2751 nodes and 1740605 edges
## #
## # A directed acyclic multigraph with 70 components
## #
## # Node Data: 2,751 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg NPPB_gene
## 6 tg EIF4G3_gene
## # … with 2,745 more rows
## #
## # Edge Data: 1,740,605 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00750 0 0.00750
## 2 1 2 0.0402 0.0238 -0.0164
## 3 1 2 -0.0662 -0.00000352 0.0662
## # … with 1,740,602 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_61_filtered <-
filtered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_61_unfiltered <-
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_61_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 0.923 19
## 2 NKX2-1 0.863 483
## 3 FOXA1 0.858 596
special_tfs_61_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 FOXA1 1.42 26
## 2 NKX2-1 1.41 45
## 3 GATA6 1.39 157
# plot histogram of all TFs
## filtered
filtered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
## unfiltered
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 6); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 6); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_61 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 6); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
unfiltered_graph_45 <-
clusters$cluster_5_diff_graphs[[4]]
filtered_graph_45 <-
clusters$cluster_5_diff_graphs[[4]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_45
## # A tbl_graph: 2696 nodes and 1612871 edges
## #
## # A directed acyclic multigraph with 103 components
## #
## # Node Data: 2,696 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg SPSB1_gene
## 6 tg NPPB_gene
## # … with 2,690 more rows
## #
## # Edge Data: 1,612,871 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00895 -0.00142 0.00753
## 2 1 2 0.00794 0 -0.00794
## 3 1 2 0.00717 0.0271 0.0199
## # … with 1,612,868 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_45_filtered <-
filtered_graph_45 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_45_unfiltered <-
unfiltered_graph_45 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_45_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 0.836 212
## 2 NKX2-1 0.804 949
## 3 FOXA1 0.787 1135
special_tfs_45_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 1.32 289
## 2 NKX2-1 1.32 473
## 3 FOXA1 1.31 699
# plot histogram of all TFs
## filtered
filtered_graph_45 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_45_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 4 (FOXA1/FOXA2) vs Cluster 5",
color = NULL
)
## unfiltered
unfiltered_graph_45 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_45_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 4 (FOXA1/FOXA2) vs Cluster 5",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_45 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 4 (vs. Cluster 5); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_45 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 4 (vs. Cluster 5); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
unfiltered_graph_46 <-
clusters$cluster_6_diff_graphs[[4]]
filtered_graph_46 <-
clusters$cluster_6_diff_graphs[[4]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_46
## # A tbl_graph: 2711 nodes and 1594085 edges
## #
## # A directed acyclic multigraph with 109 components
## #
## # Node Data: 2,711 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg NPPB_gene
## 6 tg EIF4G3_gene
## # … with 2,705 more rows
## #
## # Edge Data: 1,594,085 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00750 -0.00142 0.00608
## 2 1 2 0.0402 0 -0.0402
## 3 1 2 -0.0662 0.0271 0.0933
## # … with 1,594,082 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_46_filtered <-
filtered_graph_46 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_46_unfiltered <-
unfiltered_graph_46 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_46_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 0.849 22
## 2 NKX2-1 0.790 604
## 3 FOXA1 0.767 1035
special_tfs_46_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 NKX2-1 1.32 56
## 2 GATA6 1.31 104
## 3 FOXA1 1.29 772
# plot histogram of all TFs
## filtered
filtered_graph_46 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_46_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 4 (FOXA1/FOXA2) vs Cluster 6",
color = NULL
)
## unfiltered
unfiltered_graph_46 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_46_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 4 (FOXA1/FOXA2) vs Cluster 6",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_46 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 4 (vs. Cluster 6); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_46 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 4 (vs. Cluster 6); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
unfiltered_graph_35 <-
clusters$cluster_5_diff_graphs[[3]]
filtered_graph_35 <-
clusters$cluster_5_diff_graphs[[3]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_35
## # A tbl_graph: 2774 nodes and 1955618 edges
## #
## # A directed acyclic multigraph with 43 components
## #
## # Node Data: 2,774 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg SPSB1_gene
## 6 tg NPPB_gene
## # … with 2,768 more rows
## #
## # Edge Data: 1,955,618 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00895 -0.00145 0.00750
## 2 1 2 0.00794 0.0123 0.00431
## 3 1 2 0.00717 0.000641 -0.00652
## # … with 1,955,615 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_35_filtered <-
filtered_graph_35 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_35_unfiltered <-
unfiltered_graph_35 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_35_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 FOXA1 0.887 209
## 2 GATA6 0.864 839
## 3 NKX2-1 0.851 1098
special_tfs_35_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 FOXA1 1.40 103
## 2 NKX2-1 1.39 201
## 3 GATA6 1.37 578
# plot histogram of all TFs
## filtered
filtered_graph_35 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_35_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 3 (NKX2-1) vs Cluster 5",
color = NULL
)
## unfiltered
unfiltered_graph_35 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_35_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 3 (NKX2-1) vs Cluster 5",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_35 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 3 (vs. Cluster 5); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_35 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 3 (vs. Cluster 5); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
unfiltered_graph_36 <-
clusters$cluster_6_diff_graphs[[3]]
filtered_graph_36 <-
clusters$cluster_6_diff_graphs[[3]] |>
activate("edges") |>
#filter(abs(weight) > quantile(abs(weight), probs = 0.15)) |>
filter(abs(weight) > 1e-3) |>
activate("nodes")
filtered_graph_36
## # A tbl_graph: 2778 nodes and 1967070 edges
## #
## # A directed acyclic multigraph with 42 components
## #
## # Node Data: 2,778 × 2 (active)
## node_type node_name
## <chr> <chr>
## 1 tf ADNP
## 2 tg RPL22_gene
## 3 tg ERRFI1_gene
## 4 tg TNFRSF9_gene
## 5 tg NPPB_gene
## 6 tg EIF4G3_gene
## # … with 2,772 more rows
## #
## # Edge Data: 1,967,070 × 5
## from to weight_1 weight_2 weight
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 2 -0.00750 -0.00145 0.00605
## 2 1 2 0.0402 0.0123 -0.0279
## 3 1 2 -0.0662 0.000641 0.0669
## # … with 1,967,067 more rows
# calculate the ranking of the 3 special TFs in the normalized, filtered
# edge-subtracted graph
special_tfs_36_filtered <-
filtered_graph_36 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_36_unfiltered <-
unfiltered_graph_36 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_36_filtered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 0.887 76
## 2 FOXA1 0.870 257
## 3 NKX2-1 0.859 548
special_tfs_36_unfiltered
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 NKX2-1 1.39 111
## 2 FOXA1 1.38 168
## 3 GATA6 1.37 494
# plot histogram of all TFs
## filtered
filtered_graph_36 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_36_filtered
) +
theme_bw() +
labs(
subtitle = "Filtered graph; Cluster 3 (NKX2-1) vs Cluster 6",
color = NULL
)
## unfiltered
unfiltered_graph_36 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_36_unfiltered
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 3 (NKX2-1) vs Cluster 6",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_36 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[3]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 3 (vs. Cluster 6); unfiltered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
filtered_graph_36 |>
calculate_tf_centralities() |>
filter(degree >= quantile(degree, probs = 0.01)) |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[4]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 3 (vs. Cluster 6); filtered",
x = "Normalized degree",
y = NULL,
fill = NULL
)
The other possible solution is to avoid thresholding edges entirely and simply sum their weights (for each TF) rather than counting them (setting each edge over the threshold’s weight equal to 1).
# calculate the ranking of the 3 special TFs in the normalized
# edge-subtracted graph using sums instead of
special_tfs_51_summed <-
clusters$cluster_5_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[5]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_51_counted <-
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_51_summed
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 1.74 92
## 2 NKX2-1 1.66 279
## 3 FOXA1 1.37 1055
special_tfs_51_counted
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 NKX2-1 1.35 53
## 2 GATA6 1.34 188
## 3 FOXA1 1.31 1037
# plot histogram of all TFs
## summed
clusters$cluster_5_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[5]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_51_summed
) +
theme_bw() +
labs(
subtitle = "Summed weight graph; Cluster 1 (GATA-6) vs Cluster 5",
color = NULL
)
## counted
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_51_counted
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 1 (GATA-6) vs Cluster 5",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_51 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[5]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 5); counted",
x = "Normalized degree",
y = NULL,
fill = NULL
)
clusters$cluster_5_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[5]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf =
if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 5); summed",
x = "Normalized degree",
y = NULL,
fill = NULL
)
# calculate the ranking of the 3 special TFs in the normalized
# edge-subtracted graph using sums instead of
special_tfs_61_summed <-
clusters$cluster_6_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[6]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_61_counted <-
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
filter(node_name %in% special_tfs)
special_tfs_61_summed
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 GATA6 1.74 462
## 2 NKX2-1 1.68 780
## 3 FOXA1 1.62 951
special_tfs_61_counted
## # A tibble: 3 × 3
## node_name normalized_degree rank
## <chr> <dbl> <dbl>
## 1 NKX2-1 1.35 53
## 2 GATA6 1.34 249
## 3 FOXA1 1.33 547
# plot histogram of all TFs
## summed
clusters$cluster_6_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[6]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_summed
) +
theme_bw() +
labs(
subtitle = "Summed weight graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
## counted
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
mutate(rank = rank(-normalized_degree)) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_counted
) +
theme_bw() +
labs(
subtitle = "Unfiltered graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
# plot histogram of all TFs
## summed
compare_differential_cluster_networks(
diff_graph = clusters$cluster_6_diff_graphs[[1]],
cluster_1_centralities = clusters$weighted_centralities[[1]],
cluster_2_centralities = clusters$weighted_centralities[[6]],
weights = abs(weight)
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_summed
) +
theme_bw() +
labs(
subtitle = "Summed weight graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
## counted
unfiltered_graph_61 |>
compare_differential_cluster_networks(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_61_counted
) +
theme_bw() +
labs(
subtitle = "Counted graph; Cluster 1 (GATA-6) vs Cluster 6",
color = NULL
)
# plot 50 most differential TFs
# plot normalized centralities of most differential TFs
unfiltered_graph_61 |>
calculate_tf_centralities() |>
normalize_centralities(
cluster_1_centralities = clusters$centralities[[6]],
cluster_2_centralities = clusters$centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf = if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 6); counted",
x = "Normalized degree",
y = NULL,
fill = NULL
)
clusters$cluster_6_diff_graphs[[1]] |>
calculate_tf_centralities(weights = abs(weight)) |>
normalize_centralities(
cluster_1_centralities = clusters$weighted_centralities[[6]],
cluster_2_centralities = clusters$weighted_centralities[[1]]
) |>
slice_max(order_by = normalized_degree, n = 50) |>
mutate(
node_name = fct_reorder(node_name, normalized_degree),
special_tf =
if_else(node_name %in% special_tfs, "Special TF", "Not special TF")
) |>
ggplot(aes(x = normalized_degree, y = node_name, fill = special_tf)) +
geom_point(shape = 21) +
ggthemes::scale_fill_tableau() +
theme_bw() +
theme(axis.text.y = element_text(size = 5)) +
labs(
subtitle = "Top 50 differential TFs in Cluster 1 (vs. Cluster 6); summed",
x = "Normalized degree",
y = NULL,
fill = NULL
)
special_tfs_45_summed <-
compare_differential_cluster_networks(
diff_graph = clusters$cluster_5_diff_graphs[[4]],
cluster_1_centralities = clusters$weighted_centralities[[4]],
cluster_2_centralities = clusters$weighted_centralities[[5]],
weights = abs(weight)
) |>
filter(node_name %in% special_tfs)
compare_differential_cluster_networks(
diff_graph = clusters$cluster_5_diff_graphs[[4]],
cluster_1_centralities = clusters$weighted_centralities[[4]],
cluster_2_centralities = clusters$weighted_centralities[[5]],
weights = abs(weight)
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_45_summed
) +
theme_bw() +
labs(
subtitle = "Summed graph; Cluster 4 (FOXA1) vs Cluster 5",
color = NULL
)
special_tfs_46_summed <-
compare_differential_cluster_networks(
diff_graph = clusters$cluster_6_diff_graphs[[4]],
cluster_1_centralities = clusters$weighted_centralities[[4]],
cluster_2_centralities = clusters$weighted_centralities[[6]],
weights = abs(weight)
) |>
filter(node_name %in% special_tfs)
compare_differential_cluster_networks(
diff_graph = clusters$cluster_6_diff_graphs[[4]],
cluster_1_centralities = clusters$weighted_centralities[[4]],
cluster_2_centralities = clusters$weighted_centralities[[6]],
weights = abs(weight)
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_46_summed
) +
theme_bw() +
labs(
subtitle = "Summed graph; Cluster 4 (FOXA1) vs Cluster 6",
color = NULL
)
special_tfs_35_summed <-
compare_differential_cluster_networks(
diff_graph = clusters$cluster_5_diff_graphs[[3]],
cluster_1_centralities = clusters$weighted_centralities[[3]],
cluster_2_centralities = clusters$weighted_centralities[[5]],
weights = abs(weight)
) |>
filter(node_name %in% special_tfs)
compare_differential_cluster_networks(
diff_graph = clusters$cluster_5_diff_graphs[[3]],
cluster_1_centralities = clusters$weighted_centralities[[3]],
cluster_2_centralities = clusters$weighted_centralities[[5]],
weights = abs(weight)
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_35_summed
) +
theme_bw() +
labs(
subtitle = "Summed graph; Cluster 3 (NKX2-1) vs Cluster 5",
color = NULL
)
special_tfs_36_summed <-
compare_differential_cluster_networks(
diff_graph = clusters$cluster_6_diff_graphs[[3]],
cluster_1_centralities = clusters$weighted_centralities[[3]],
cluster_2_centralities = clusters$weighted_centralities[[6]],
weights = abs(weight)
) |>
filter(node_name %in% special_tfs)
compare_differential_cluster_networks(
diff_graph = clusters$cluster_6_diff_graphs[[3]],
cluster_1_centralities = clusters$weighted_centralities[[3]],
cluster_2_centralities = clusters$weighted_centralities[[6]],
weights = abs(weight)
) |>
ggplot(aes(x = normalized_degree)) +
geom_histogram(bins = 30, color = "black", size = 0.5) +
geom_vline(
aes(xintercept = normalized_degree, color = node_name),
data = special_tfs_36_summed
) +
theme_bw() +
labs(
subtitle = "Summed graph; Cluster 3 (NKX2-1) vs Cluster 6",
color = NULL
)
TO DO: For the summed version, maybe it’s a good idea to express the edge-subtracted network as a percentage change or fold change of the second network compared to the first one instead of simply using the raw difference (so that all edges are on the same order of magnitude).
dir(input_path) |>
str_subset(pattern = "\\.mtx$")
## [1] "reprogramseq_HTO10_2_u.mtx" "reprogramseq_HTO10_2_v.mtx"
## [3] "reprogramseq_HTO6_4_u.mtx" "reprogramseq_HTO6_4_v.mtx"
## [5] "reprogramseq_HTO8_3_u.mtx" "reprogramseq_HTO8_3_v.mtx"
u_gata6 <-
file.path(input_path, "reprogramseq_HTO10_2_u.mtx") |>
Matrix::readMM()
v_gata6 <-
file.path(input_path, "reprogramseq_HTO10_2_v.mtx") |>
Matrix::readMM()